Data Ingestion and Transformation


In [5]:
%matplotlib inline

import os
import json
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Load dataset
df = pd.read_csv('dataset-1min.csv')
# Drop duplicates
df = df.drop_duplicates()
df.head(3)


Out[6]:
datetime temperature temperature_f humidity co2 light light_status noise bluetooth_devices image_hist_change door_status occupancy_count occupancy_category
0 2017-03-25 09:05:00 22.60 72.68 36.9 781.0 430.000000 light-on 511.000000 1.0 15.242697 closed 0.0 very-low
2 2017-03-25 09:06:00 23.80 74.84 38.9 789.0 437.000000 light-on 491.000000 16.0 15.242697 closed 0.0 very-low
4 2017-03-25 09:07:00 23.85 74.93 38.8 766.0 421.653846 light-on 506.586957 25.0 15.242697 closed 15.0 low

In [7]:
# Determine the shape of the data
print("{} instances with {} features\n".format(*df.shape))

# Determine the frequency of each class
print(df.groupby('occupancy_category')['occupancy_category'].count())


8332 instances with 13 features

occupancy_category
fair         351
high        3342
low          210
very-low    4429
Name: occupancy_category, dtype: int64

In [8]:
# Helper function to encode occupancy_category based on the number of people 
def occupancy(df):
  if df['occupancy_category'] == 'very-low':
    return '1'
  elif df['occupancy_category'] == 'low':
    return '2'
  elif df['occupancy_category'] == 'fair':
    return '3'
  else:
    return '4'

df['occupancy_code'] = df.apply(occupancy, axis=1)
df.head(3)


Out[8]:
datetime temperature temperature_f humidity co2 light light_status noise bluetooth_devices image_hist_change door_status occupancy_count occupancy_category occupancy_code
0 2017-03-25 09:05:00 22.60 72.68 36.9 781.0 430.000000 light-on 511.000000 1.0 15.242697 closed 0.0 very-low 1
2 2017-03-25 09:06:00 23.80 74.84 38.9 789.0 437.000000 light-on 491.000000 16.0 15.242697 closed 0.0 very-low 1
4 2017-03-25 09:07:00 23.85 74.93 38.8 766.0 421.653846 light-on 506.586957 25.0 15.242697 closed 15.0 low 2

In [9]:
# Read the data into a DataFrame
features = [
    'temperature',
    'humidity',
    'co2',
    'light',
    'noise',
    'bluetooth_devices',
    'occupancy_code'
]
classes = [
    'very-low',
    'low',
    'fair',
    'high'
] 

df = df[features]
df.shape


Out[9]:
(8332, 7)

In [10]:
# Extract the target from the data
data   = df.ix[:, 0:-1]
target = df.ix[:, -1]

print(data.shape)
print(target.shape)


(8332, 6)
(8332,)

In [11]:
# Split into test and train data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target)

In [12]:
# Standarize data
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

Classification Models Evaluation


In [13]:
from sklearn import metrics
from sklearn.cross_validation import KFold
from yellowbrick.classifier import ClassificationReport, ROCAUC, ClassBalance

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [14]:
def fit_and_evaluate(dataset, model, label, **kwargs):
    """
    Because of the Scikit-Learn API, we can create a function to
    do all of the fit and evaluate work on our behalf!
    """
    start  = time.time() # Start the clock! 
    scores = {'precision':[], 'recall':[], 'accuracy':[], 'f1':[]}
    
    for train, test in KFold(data.shape[0], n_folds=12, shuffle=True):
        
        estimator = model(**kwargs)
        estimator.fit(X_train, y_train)
        
        expected  = y_test
        predicted = estimator.predict(X_test)
        
        # Append our scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))

    # Report
    print("Build and Validation of {} took {:0.3f} seconds".format(label, time.time()-start))
    print("Validation scores are as follows:\n")
    print(pd.DataFrame(scores).mean())

Support Vector Classfier


In [15]:
# Perform SVC Classification
svc = SVC()
fit_and_evaluate(df, SVC, "SVM Classifier")


Build and Validation of SVM Classifier took 5.500 seconds
Validation scores are as follows:

accuracy     0.924148
f1           0.898368
precision    0.929549
recall       0.924148
dtype: float64

In [16]:
visualizer = ClassificationReport(svc, classes=classes)

visualizer.fit(X_train, y_train)  
visualizer.score(X_test, y_test)  
g = visualizer.poof()


K Nearest Neighbors Classifier


In [28]:
# Perform kNN Classification
knn = KNeighborsClassifier()
fit_and_evaluate(df, KNeighborsClassifier, "kNN Classifier", n_neighbors=12)


Build and Validation of kNN Classifier took 0.780 seconds
Validation scores are as follows:

accuracy     0.944311
f1           0.937485
precision    0.941137
recall       0.944311
dtype: float64

In [29]:
visualizer = ClassificationReport(knn, classes=classes)

visualizer.fit(X_train, y_train)  
visualizer.score(X_test, y_test)  
g = visualizer.poof()


Random Forest Classfier


In [30]:
# Perform Random Forest Classification
rfc = RandomForestClassifier()
fit_and_evaluate(df, RandomForestClassifier, "Random Forest Classifier")


Build and Validation of Random Forest Classifier took 1.338 seconds
Validation scores are as follows:

accuracy     0.955593
f1           0.953264
precision    0.952740
recall       0.955593
dtype: float64

In [31]:
visualizer = ClassificationReport(rfc, classes=classes)

visualizer.fit(X_train, y_train)  
visualizer.score(X_test, y_test)  
g = visualizer.poof()



In [ ]: